@inproceedings{alaa2018limits,
  title     = {Limits of estimating heterogeneous treatment effects: Guidelines for practical algorithm design},
  author    = {Alaa, Ahmed and van der Schaar, Mihaela},
  booktitle = {International Conference on Machine Learning},
  pages     = {129--138},
  year      = {2018}
}
@article{kunzel2019metalearners,
  title     = {Metalearners for estimating heterogeneous treatment effects using machine learning},
  author    = {K{\"u}nzel, S{\"o}ren R and Sekhon, Jasjeet S and Bickel, Peter J and Yu, Bin},
  journal   = {Proceedings of the National Academy of Sciences},
  volume    = {116},
  number    = {10},
  pages     = {4156--4165},
  year      = {2019},
  publisher = {National Academy of Sciences}
}
@article{nie2017quasi,
  title         = {Quasi-oracle estimation of heterogeneous treatment effects},
  author        = {Nie, Xinkun and Wager, Stefan},
  journal       = {arXiv preprint arXiv:1712.04912},
  year          = {2017},
  archivePrefix = {arXiv},
  eprint        = {1712.04912}
}
@article{imbens2009recent,
  title   = {Recent developments in the econometrics of program evaluation},
  author  = {Imbens, Guido W and Wooldridge, Jeffrey M},
  journal = {Journal of Economic Literature},
  volume  = {47},
  number  = {1},
  pages   = {5--86},
  year    = {2009}
}
@inproceedings{shalit2017estimating,
  title        = {Estimating individual treatment effect: generalization bounds and algorithms},
  author       = {Shalit, Uri and Johansson, Fredrik D and Sontag, David},
  booktitle    = {Proceedings of the 34th International Conference on Machine Learning - Volume 70},
  pages        = {3076--3085},
  year         = {2017},
  organization = {JMLR.org}
}
@article{athey2016recursive,
  title     = {Recursive partitioning for heterogeneous causal effects},
  author    = {Athey, Susan and Imbens, Guido},
  journal   = {Proceedings of the National Academy of Sciences},
  volume    = {113},
  number    = {27},
  pages     = {7353--7360},
  year      = {2016},
  publisher = {National Academy of Sciences}
}
@article{hahn2017bayesian,
  author        = {Hahn, P. Richard and Murray, Jared S. and Carvalho, Carlos},
  title         = {{Bayesian} regression tree models for causal inference: regularization, confounding, and heterogeneous effects},
  journal       = {arXiv e-prints},
  keywords      = {Statistics - Methodology},
  year          = {2017},
  month         = jun,
  eid           = {arXiv:1706.09523},
  pages         = {arXiv:1706.09523},
  archivePrefix = {arXiv},
  eprint        = {1706.09523},
  primaryClass  = {stat.ME},
  adsurl        = {https://ui.adsabs.harvard.edu/abs/2017arXiv170609523H},
  adsnote       = {Provided by the SAO/NASA Astrophysics Data System}
}
@article{athey2019generalized,
  author    = {Athey, Susan and Tibshirani, Julie and Wager, Stefan and others},
  title     = {Generalized random forests},
  journal   = {The Annals of Statistics},
  volume    = {47},
  number    = {2},
  pages     = {1148--1178},
  publisher = {Institute of Mathematical Statistics},
  year      = {2019}
}
@inproceedings{hartford2017deep,
  title        = {Deep {IV}: A flexible approach for counterfactual prediction},
  author       = {Hartford, Jason and Lewis, Greg and Leyton-Brown, Kevin and Taddy, Matt},
  booktitle    = {Proceedings of the 34th International Conference on Machine Learning - Volume 70},
  pages        = {1414--1423},
  year         = {2017},
  organization = {JMLR.org}
}
@article{oprescu2018orthogonal,
  author        = {Oprescu, Miruna and Syrgkanis, Vasilis and Wu, Zhiwei Steven},
  title         = {Orthogonal Random Forest for Heterogeneous Treatment Effect Estimation},
  journal       = {CoRR},
  volume        = {abs/1806.03467},
  year          = {2018},
  url           = {http://arxiv.org/abs/1806.03467},
  archivePrefix = {arXiv},
  eprint        = {1806.03467},
  timestamp     = {Mon, 13 Aug 2018 16:46:26 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1806-03467},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}


@article{Gutierrez2016-co,
  title   = {Causal Inference and Uplift Modeling: A review of the literature},
  author  = {Gutierrez, Pierre and Gerardy, Jean-Yves},
  journal = {JMLR: Workshop and Conference Proceedings},
  volume  = {67},
  year    = {2016}
}

@article{Rzepakowski2012-br,
  title    = {Decision trees for uplift modeling with single and multiple treatments},
  author   = {Rzepakowski, Piotr and Jaroszewicz, Szymon},
  abstract = {Most classification approaches aim at achieving high prediction
              accuracy on a given dataset. However, in most practical cases,
              some action such as mailing an offer or treating a patient is to
              be taken on the classified objects, and we should model not the
              class probabilities themselves, but instead, the change in class
              probabilities caused by the action. The action should then be
              performed on those objects for which it will be most profitable.
              This problem is known as uplift modeling, differential response
              analysis, or true lift modeling, but has received very little
              attention in machine learning literature. An important
              modification of the problem involves several possible actions,
              when for each object, the model must also decide which action
              should be used in order to maximize profit. In this paper, we
              present tree-based classifiers designed for uplift modeling in
              both single and multiple treatment cases. To this end, we design
              new splitting criteria and pruning methods. The experiments
              confirm the usefulness of the proposed approaches and show
              significant improvement over previous uplift modeling techniques.},
  journal  = {Knowledge and Information Systems},
  volume   = {32},
  number   = {2},
  pages    = {303--327},
  month    = aug,
  year     = {2012}
}

@inproceedings{Zhao2017-kg,
  title        = {Uplift Modeling with Multiple Treatments and General Response Types},
  author       = {Zhao, Yan and Fang, Xiao and Simchi-Levi, David},
  abstract     = {Randomized experiments have been used to assist
                  decision-making in many areas. They help people select the
                  optimal treatment for the test population with certain
                  statistical guarantee. However, subjects can show
                  significant heterogeneity in response to treatments. The
                  problem of customizing treatment assignment based on subject
                  characteristics is known as uplift modeling, differential
                  response analysis, or personalized treatment learning in
                  literature. A key feature for uplift modeling is that the
                  data is unlabeled. It is impossible to know whether the
                  chosen treatment is optimal for an individual subject
                  because response under alternative treatments is unobserved.
                  This presents a challenge to both the training and the
                  evaluation of uplift models. In this paper we describe how
                  to obtain an unbiased estimate of the key performance metric
                  of an uplift model, the expected response. We present a new
                  uplift algorithm which creates a forest of randomized trees.
                  The trees are built with a splitting criterion designed to
                  directly optimize their uplift performance based on the
                  proposed evaluation method. Both the evaluation method and
                  the algorithm apply to arbitrary number of treatments and
                  general response types. Experimental results on synthetic
                  data and industry-provided data show that our algorithm
                  leads to significant performance improvement over other
                  applicable methods.},
  booktitle    = {Proceedings of the 2017 SIAM International Conference on Data Mining},
  pages        = {588--596},
  year         = {2017},
  organization = {SIAM}
}

@inproceedings{Guelman2012-bx,
  title     = {Random Forests for Uplift Modeling: An Insurance Customer Retention Case},
  booktitle = {Modeling and Simulation in Engineering, Economics and Management},
  author    = {Guelman, Leo and Guill{\'e}n, Montserrat and P{\'e}rez-Mar{\'\i}n, Ana M},
  abstract  = {Models of customer churn are based on historical data and are
               used to predict the probability that a client switches to
               another company. We address customer retention in insurance.
               Rather than concentrating on those customers with high
               probability of leaving, we propose a new procedure that can be
               used to identify the target customers who are likely to respond
               positively to a retention activity. Our approach is based on
               random forests and can be useful to anticipate the success of
               marketing actions aimed at reducing customer attrition. We also
               discuss the type of insurance portfolio database that can be
               used for this purpose.},
  publisher = {Springer Berlin Heidelberg},
  pages     = {123--133},
  year      = {2012}
}

@article{Guelman2015-qe,
  title     = {Uplift Random Forests},
  author    = {Guelman, Leo and Guill{\'e}n, Montserrat and P{\'e}rez-Mar{\'\i}n, Ana M},
  abstract  = {Conventional supervised statistical learning models aim to
               achieve high accuracy in predicting the value of an outcome
               measure based on a number of input measures. However, in many
               applications, some type of action is randomized on the
               observational units. This is the case, for example, in
               treatment/control settings, such as those usually encountered in
               marketing and clinical trial applications. In these situations,
               we may not necessarily be interested in predicting the outcome
               itself, but in estimating the expected change in the outcome as
               a result of the action. This is precisely the idea behind uplift
               models, which, despite their many practical applications, have
               received little attention in the literature. In this article, we
               extend the state-of-the-art research in this area by proposing a
               new approach based on Random Forests. We perform carefully
               designed experiments using simple simulation models to
               illustrate some of the properties of the proposed method. In
               addition, we present evidence on a dataset pertaining to a large
               Canadian insurer on a customer retention case. The results
               confirm the effectiveness of the proposed method and show
               favorable performance relative to other existing uplift modeling
               approaches.},
  journal   = {Cybernetics and Systems},
  publisher = {Taylor \& Francis},
  volume    = {46},
  number    = {3--4},
  pages     = {230--248},
  month     = may,
  year      = {2015}
}


@article{noauthor_undated-xm,
  title    = {Estimating Heterogeneous Treatment Effects Using Neural Networks
              With The {Y-Learner}},
  author   = {Stadie, Bradly C and K{\"u}nzel, S{\"o}ren R and Vemuri, Nikita
              and Sekhon, Jasjeet S},
  abstract = {We develop the Y-learner for estimating heterogeneous treatment
              effects in experimental and observational studies. The Y-learner
              is designed to leverage the abilities of neural networks to
              optimize multiple objectives and continually update, which allows
              for better pooling of underlying feature information between
              treatment and control groups. We evaluate the Y-learner on three
              test problems: (1) A set of six simulated data benchmarks from
              the literature. (2) A real-world large-scale experiment on voter
              persuasion. (3) A task from the literature that estimates
              artificially generated treatment effects on MNIST digits. The
              Y-learner achieves state of the art results on two of the three
              tasks. On the MNIST task, it gets the second best results.},
  month    = sep,
  year     = {2018}
}

@article{Kunzel2018-sn,
  title         = {Transfer Learning for Estimating Causal Effects using Neural Networks},
  author        = {K{\"u}nzel, S{\"o}ren R and Stadie, Bradly C and Vemuri,
                   Nikita and Ramakrishnan, Varsha and Sekhon, Jasjeet S and
                   Abbeel, Pieter},
  abstract      = {We develop new algorithms for estimating heterogeneous
                   treatment effects, combining recent developments in transfer
                   learning for neural networks with insights from the causal
                   inference literature. By taking advantage of transfer
                   learning, we are able to efficiently use different data
                   sources that are related to the same underlying causal
                   mechanisms. We compare our algorithms with those in the
                   extant literature using extensive simulation studies based
                   on large-scale voter persuasion experiments and the MNIST
                   database. Our methods can perform an order of magnitude
                   better than existing benchmarks while using a fraction of
                   the data.},
  month         = aug,
  year          = {2018},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML},
  eprint        = {1808.07804}
}

@article{Friedberg2018-pb,
  title         = {Local Linear Forests},
  author        = {Friedberg, Rina and Tibshirani, Julie and Athey, Susan and
                   Wager, Stefan},
  abstract      = {Random forests are a powerful method for non-parametric
                   regression, but are limited in their ability to fit smooth
                   signals, and can show poor predictive performance in the
                   presence of strong, smooth effects. Taking the perspective
                   of random forests as an adaptive kernel method, we pair the
                   forest kernel with a local linear regression adjustment to
                   better capture smoothness. The resulting procedure, local
                   linear forests, enables us to improve on asymptotic rates of
                   convergence for random forests with smooth signals, and
                   provides substantial gains in accuracy on both real and
                   simulated data. We prove a central limit theorem and propose
                   a computationally efficient construction for confidence
                   intervals.},
  month         = jul,
  year          = {2018},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML},
  eprint        = {1807.11408}
}
@article{athey2017efficient,
  author  = {Athey, Susan and Wager, Stefan},
  title   = {Efficient policy learning},
  journal = {arXiv preprint arXiv:1702.02896},
  year    = {2017}
}

@inproceedings{ijcai2019-248,
  title     = {Unit Selection Based on Counterfactual Logic},
  author    = {Li, Ang and Pearl, Judea},
  booktitle = {Proceedings of the Twenty-Eighth International Joint Conference on
               Artificial Intelligence, {IJCAI-19}},
  publisher = {International Joint Conferences on Artificial Intelligence Organization},
  pages     = {1793--1799},
  year      = {2019},
  month     = jul,
  doi       = {10.24963/ijcai.2019/248},
  url       = {https://doi.org/10.24963/ijcai.2019/248}
}

@book{angrist2008mostly,
  title     = {Mostly harmless econometrics: An empiricist's companion},
  author    = {Angrist, Joshua D and Pischke, J{\"o}rn-Steffen},
  year      = {2008},
  publisher = {Princeton University Press}
}

@book{pearl2009causality,
  title     = {Causality},
  author    = {Pearl, Judea},
  edition   = {Second},
  year      = {2009},
  publisher = {Cambridge University Press}
}

@inproceedings{zhao2019uplift,
  author       = {Zhao, Zhenyu and Harinen, Totte},
  title        = {Uplift modeling for multiple treatments with cost optimization},
  booktitle    = {2019 IEEE International Conference on Data Science and Advanced Analytics (DSAA)},
  pages        = {422--431},
  organization = {IEEE},
  year         = {2019}
}

@article{stuart2010matching,
  title     = {Matching methods for causal inference: A review and a look forward},
  author    = {Stuart, Elizabeth A},
  journal   = {Statistical Science: A Review Journal of the Institute of Mathematical Statistics},
  volume    = {25},
  number    = {1},
  pages     = {1},
  year      = {2010},
  publisher = {NIH Public Access}
}

@article{hansotia2002ddp,
  title   = {Incremental value modeling},
  author  = {Hansotia, Behram and Rukstales, Brad},
  journal = {Journal of Interactive Marketing},
  volume  = {16},
  number  = {3},
  pages   = {35--46},
  year    = {2002}
}

@article{su2009subgroup,
  title   = {Subgroup analysis via recursive partitioning},
  author  = {Su, Xiaogang and Tsai, Chih-Ling and Wang, Hansheng and Nickerson, David M and Li, Bogong},
  journal = {Journal of Machine Learning Research},
  volume  = {10},
  number  = {2},
  year    = {2009}
}

@article{su2012facilitating,
  title   = {Facilitating score and causal inference trees for large observational studies},
  author  = {Su, Xiaogang and Kang, Joseph and Fan, Juanjuan and Levine, Richard A and Yan, Xin},
  journal = {Journal of Machine Learning Research},
  volume  = {13},
  pages   = {2955--2994},
  year    = {2012}
}

@inproceedings{rossler2022the,
  title     = {The Best of Two Worlds: Using Recent Advances from Uplift Modeling and
               Heterogeneous Treatment Effects to Optimize Targeting Policies},
  author    = {R{\"o}{\ss}ler, Jannik and Guse, Richard and Schoder, Detlef},
  booktitle = {International Conference on Information Systems},
  year      = {2022}
}

@article{https://doi.org/10.1111/1468-0262.00442,
  author   = {Hirano, Keisuke and Imbens, Guido W. and Ridder, Geert},
  title    = {Efficient Estimation of Average Treatment Effects Using the Estimated Propensity Score},
  journal  = {Econometrica},
  volume   = {71},
  number   = {4},
  pages    = {1161--1189},
  keywords = {Propensity score, treatment effects, semiparametric efficiency, sieve estimator},
  doi      = {10.1111/1468-0262.00442},
  url      = {https://onlinelibrary.wiley.com/doi/abs/10.1111/1468-0262.00442},
  eprint   = {https://onlinelibrary.wiley.com/doi/pdf/10.1111/1468-0262.00442},
  abstract = {We are interested in estimating the average effect of a binary treatment on a scalar outcome. If assignment to the treatment is exogenous or unconfounded, that is, independent of the potential outcomes given covariates, biases associated with simple treatment-control average comparisons can be removed by adjusting for differences in the covariates. Rosenbaum and Rubin (1983) show that adjusting solely for differences between treated and control units in the propensity score removes all biases associated with differences in covariates. Although adjusting for differences in the propensity score removes all the bias, this can come at the expense of efficiency, as shown by Hahn (1998), Heckman, Ichimura, and Todd (1998), and Robins, Mark, and Newey (1992). We show that weighting by the inverse of a nonparametric estimate of the propensity score, rather than the true propensity score, leads to an efficient estimate of the average treatment effect. We provide intuition for this result by showing that this estimator can be interpreted as an empirical likelihood estimator that efficiently incorporates the information about the propensity score.},
  year     = {2003}
}

@article{https://doi.org/10.1002/sim.6607,
  author   = {Austin, Peter C. and Stuart, Elizabeth A.},
  title    = {Moving towards best practice when using inverse probability of treatment weighting (IPTW) using the propensity score to estimate causal treatment effects in observational studies},
  journal  = {Statistics in Medicine},
  volume   = {34},
  number   = {28},
  pages    = {3661--3679},
  keywords = {observational study, propensity score, inverse probability of treatment weighting, IPTW, causal inference},
  doi      = {10.1002/sim.6607},
  url      = {https://onlinelibrary.wiley.com/doi/abs/10.1002/sim.6607},
  eprint   = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/sim.6607},
  abstract = {The propensity score is defined as a subject's probability of treatment selection, conditional on observed baseline covariates. Weighting subjects by the inverse probability of treatment received creates a synthetic sample in which treatment assignment is independent of measured baseline covariates. Inverse probability of treatment weighting (IPTW) using the propensity score allows one to obtain unbiased estimates of average treatment effects. However, these estimates are only valid if there are no residual systematic differences in observed baseline characteristics between treated and control subjects in the sample weighted by the estimated inverse probability of treatment. We report on a systematic literature review, in which we found that the use of IPTW has increased rapidly in recent years, but that in the most recent year, a majority of studies did not formally examine whether weighting balanced measured covariates between treatment groups. We then proceed to describe a suite of quantitative and qualitative methods that allow one to assess whether measured baseline covariates are balanced between treatment groups in the weighted sample. The quantitative methods use the weighted standardized difference to compare means, prevalences, higher-order moments, and interactions. The qualitative methods employ graphical methods to compare the distribution of continuous baseline covariates between treated and control subjects in the weighted sample. Finally, we illustrate the application of these methods in an empirical case study. We propose a formal set of balance diagnostics that contribute towards an evolving concept of ‘best practice’ when using IPTW to estimate causal treatment effects using observational data. © 2015 The Authors. Statistics in Medicine Published by John Wiley \& Sons Ltd.},
  year     = {2015}
}

@article{10.1257/jep.15.4.69,
  author  = {Angrist, Joshua D. and Krueger, Alan B.},
  title   = {Instrumental Variables and the Search for Identification: From Supply and Demand to Natural Experiments},
  journal = {Journal of Economic Perspectives},
  volume  = {15},
  number  = {4},
  year    = {2001},
  month   = dec,
  pages   = {69--85},
  doi     = {10.1257/jep.15.4.69},
  url     = {https://www.aeaweb.org/articles?id=10.1257/jep.15.4.69}
}

@article{chen2020causalml,
  title   = {{CausalML}: {Python} package for causal machine learning},
  author  = {Chen, Huigang and Harinen, Totte and Lee, Jeong-Yoon and Yung, Mike and Zhao, Zhenyu},
  journal = {arXiv preprint arXiv:2002.11631},
  year    = {2020}
}

@article{zhao2020feature,
  author  = {Zhao, Zhenyu and Zhang, Yumin and Harinen, Totte and Yung, Mike},
  title   = {Feature Selection Methods for Uplift Modeling},
  journal = {arXiv preprint arXiv:2005.03447},
  year    = {2020}
}

@article{tian2000probabilities,
  author    = {Tian, Jin and Pearl, Judea},
  title     = {Probabilities of causation: Bounds and identification},
  journal   = {Annals of Mathematics and Artificial Intelligence},
  volume    = {28},
  number    = {1},
  pages     = {287--313},
  publisher = {Springer},
  year      = {2000}
}

@phdthesis{balke1995probabilistic,
  title  = {Probabilistic counterfactuals: semantics, computation, and applications},
  author = {Balke, Alexander Abraham},
  school = {University of California, Los Angeles},
  year   = {1995}
}
  
@misc{kennedy2020optimal,
  author        = {Edward H. Kennedy},
  title         = {Optimal doubly robust estimation of heterogeneous causal effects},
  year          = {2020},
  eprint        = {2004.14497},
  archivePrefix = {arXiv},
  primaryClass  = {math.ST}
}

@article{10.1111/ectj.12097,
  author   = {Chernozhukov, Victor and Chetverikov, Denis and Demirer, Mert and Duflo, Esther and Hansen, Christian and Newey, Whitney and Robins, James},
  title    = {Double/debiased machine learning for treatment and structural parameters},
  journal  = {The Econometrics Journal},
  volume   = {21},
  number   = {1},
  pages    = {C1--C68},
  year     = {2018},
  month    = jan,
  abstract = {We revisit the classic semi‐parametric problem of inference on a low‐dimensional parameter θ0 in the presence of high‐dimensional nuisance parameters η0. We depart from the classical setting by allowing for η0 to be so high‐dimensional that the traditional assumptions (e.g. Donsker properties) that limit complexity of the parameter space for this object break down. To estimate η0, we consider the use of statistical or machine learning (ML) methods, which are particularly well suited to estimation in modern, very high‐dimensional cases. ML methods perform well by employing regularization to reduce variance and trading off regularization bias with overfitting in practice. However, both regularization bias and overfitting in estimating η0 cause a heavy bias in estimators of θ0 that are obtained by naively plugging ML estimators of η0 into estimating equations for θ0. This bias results in the naive estimator failing to be N−1/2 consistent, where N is the sample size. We show that the impact of regularization bias and overfitting on estimation of the parameter of interest θ0 can be removed by using two simple, yet critical, ingredients: (1) using Neyman‐orthogonal moments/scores that have reduced sensitivity with respect to nuisance parameters to estimate θ0; (2) making use of cross‐fitting, which provides an efficient form of data‐splitting. We call the resulting set of methods double or debiased ML (DML). We verify that DML delivers point estimators that concentrate in an N−1/2‐neighbourhood of the true parameter values and are approximately unbiased and normally distributed, which allows construction of valid confidence statements.
The generic statistical theory of DML is elementary and simultaneously relies on only weak theoretical requirements, which will admit the use of a broad array of modern ML methods for estimating the nuisance parameters, such as random forests, lasso, ridge, deep neural nets, boosted trees, and various hybrids and ensembles of these methods. We illustrate the general theory by applying it to provide theoretical properties of the following: DML applied to learn the main regression parameter in a partially linear regression model; DML applied to learn the coefficient on an endogenous variable in a partially linear instrumental variables model; DML applied to learn the average treatment effect and the average treatment effect on the treated under unconfoundedness; DML applied to learn the local average treatment effect in an instrumental variables setting. In addition to these theoretical applications, we also illustrate the use of DML in three empirical examples.},
  issn     = {1368-4221},
  doi      = {10.1111/ectj.12097},
  url      = {https://doi.org/10.1111/ectj.12097},
  eprint   = {https://academic.oup.com/ectj/article-pdf/21/1/C1/27684918/ectj00c1.pdf}
}

@book{tmle,
  author    = {van der Laan, Mark J. and Rose, Sherri},
  title     = {Targeted Learning: Causal Inference for Observational and Experimental Data},
  year      = {2011},
  month     = jan,
  publisher = {Springer-Verlag New York},
  isbn      = {978-1-4419-9781-4},
  doi       = {10.1007/978-1-4419-9782-1}
}
